InĀ [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
InĀ [13]:
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['MedHouseVal'] = data.target
df
Out[13]:
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | MedHouseVal | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
| 1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
| 2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
| 3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
| 4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | 1.5603 | 25.0 | 5.045455 | 1.133333 | 845.0 | 2.560606 | 39.48 | -121.09 | 0.781 |
| 20636 | 2.5568 | 18.0 | 6.114035 | 1.315789 | 356.0 | 3.122807 | 39.49 | -121.21 | 0.771 |
| 20637 | 1.7000 | 17.0 | 5.205543 | 1.120092 | 1007.0 | 2.325635 | 39.43 | -121.22 | 0.923 |
| 20638 | 1.8672 | 18.0 | 5.329513 | 1.171920 | 741.0 | 2.123209 | 39.43 | -121.32 | 0.847 |
| 20639 | 2.3886 | 16.0 | 5.254717 | 1.162264 | 1387.0 | 2.616981 | 39.37 | -121.24 | 0.894 |
20640 rows Ć 9 columns
InĀ [14]:
fig, axes = plt.subplots(3, 3, figsize=(10, 5))
for i, column in enumerate(df.columns):
df[column].plot(kind='hist', ax=axes[i // 3, i % 3], title=f'Histogram of {column}', bins=30)
plt.tight_layout()
plt.show()
InĀ [15]:
from sklearn.preprocessing import StandardScaler
df['AveRooms'] = StandardScaler().fit_transform(df['AveRooms'].values.reshape(-1, 1))
df['AveBedrms'] = StandardScaler().fit_transform(df['AveBedrms'].values.reshape(-1, 1))
df
Out[15]:
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | MedHouseVal | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.3252 | 41.0 | 0.628559 | -0.153758 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
| 1 | 8.3014 | 21.0 | 0.327041 | -0.263336 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
| 2 | 7.2574 | 52.0 | 1.155620 | -0.049016 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
| 3 | 5.6431 | 52.0 | 0.156966 | -0.049833 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
| 4 | 3.8462 | 52.0 | 0.344711 | -0.032906 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | 1.5603 | 25.0 | -0.155023 | 0.077354 | 845.0 | 2.560606 | 39.48 | -121.09 | 0.781 |
| 20636 | 2.5568 | 18.0 | 0.276881 | 0.462365 | 356.0 | 3.122807 | 39.49 | -121.21 | 0.771 |
| 20637 | 1.7000 | 17.0 | -0.090318 | 0.049414 | 1007.0 | 2.325635 | 39.43 | -121.22 | 0.923 |
| 20638 | 1.8672 | 18.0 | -0.040211 | 0.158778 | 741.0 | 2.123209 | 39.43 | -121.32 | 0.847 |
| 20639 | 2.3886 | 16.0 | -0.070443 | 0.138403 | 1387.0 | 2.616981 | 39.37 | -121.24 | 0.894 |
20640 rows Ć 9 columns
InĀ [16]:
from sklearn.model_selection import train_test_split
y = df['MedHouseVal']
X = df.drop(['MedHouseVal'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape
Out[16]:
((16512, 8), (4128, 8))
InĀ [17]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)
Out[17]:
0.5757877060324511
InĀ [18]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr.score(X_test, y_test)
Out[18]:
0.6184586450052418
InĀ [19]:
from sklearn.tree import plot_tree
plt.figure(figsize=(14, 7))
plot_tree(dtr, filled=True, feature_names=X_test.columns, max_depth=3, proportion=True, fontsize=8)
plt.show()
InĀ [20]:
import dtreeviz
viz_rmodel = dtreeviz.model(dtr, X_test, y_test, target_name='MedHouseVal', feature_names=X_test.columns)
viz_rmodel.view(depth_range_to_display=(1, 5), orientation='LR', scale=1.5)
x:\_Netology\DS_ModelsParametersModule1\env\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but DecisionTreeRegressor was fitted with feature names x:\_Netology\DS_ModelsParametersModule1\env\Lib\site-packages\numpy\_core\fromnumeric.py:3596: RuntimeWarning: Mean of empty slice. x:\_Netology\DS_ModelsParametersModule1\env\Lib\site-packages\numpy\_core\_methods.py:138: RuntimeWarning: invalid value encountered in scalar divide
Out[20]:
InĀ [44]:
import random
from pprint import pprint
best_choice = {}
for cnt in range(1, 600):
dtr = DecisionTreeRegressor(max_depth=random.randint(1, 20),
min_samples_leaf=random.randint(1, 30),
max_features=random.randint(1, 8))
dtr.fit(X_train, y_train)
res_scores = dtr.score(X_test, y_test)
ini_scores = dtr.score(X_train, y_train)
if res_scores > best_choice.get('score', 0):
print(f"Best Score: {res_scores:.6} | Initial Score: {ini_scores:.6}")
best_choice['score'] = res_scores
best_choice['model'] = dtr
print()
pprint(best_choice)
Best Score: 0.680757 | Initial Score: 0.770631
Best Score: 0.681313 | Initial Score: 0.900481
Best Score: 0.701296 | Initial Score: 0.840353
Best Score: 0.736389 | Initial Score: 0.812619
Best Score: 0.738296 | Initial Score: 0.841445
Best Score: 0.748956 | Initial Score: 0.882894
Best Score: 0.756807 | Initial Score: 0.82948
{'model': DecisionTreeRegressor(max_depth=13, max_features=5, min_samples_leaf=18),
'score': 0.7568068658250048}